In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Activation, Dropout, Flatten, Dense, Input, Layer
from tensorflow.keras.layers import Embedding, LSTM, add, Concatenate, Reshape, concatenate, Bidirectional
from tensorflow.keras.applications import VGG16, ResNet50, DenseNet201
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from textwrap import wrap

plt.rcParams['font.size'] = 12
sns.set_style("dark")
warnings.filterwarnings('ignore')

Image Captioning¶

What is Image Captioning ?

  • Image Captioning is the process of generating textual description of an image. It uses both Natural Language Processing and Computer Vision to generate the captions.
  • This task lies at the intersection of computer vision and natural language processing. Most image captioning systems use an encoder-decoder framework, where an input image is encoded into an intermediate representation of the information in the image, and then decoded into a descriptive text sequence.

CNNs + RNNs (LSTMs)

  • To perform Image Captioning we will require two deep learning models combined into one for the training purpose
  • CNNs extract the features from the image of some vector size aka the vector embeddings. The size of these embeddings depend on the type of pretrained network being used for the feature extraction
  • LSTMs are used for the text generation process. The image embeddings are concatenated with the word embeddings and passed to the LSTM to generate the next word
  • For a more illustrative explanation of this architecture check the Modelling section for a picture representation
In [10]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr8k")

print("Path to dataset files:", path)
Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.13), please consider upgrading to the latest version (0.4.0).
Resuming download from 39845888 bytes (1073125275 bytes left)...
Resuming download from https://www.kaggle.com/api/v1/datasets/download/adityajn105/flickr8k?dataset_version_number=1 (39845888/1112971163) bytes left.
100%|██████████| 1.04G/1.04G [04:46<00:00, 3.75MB/s]
Extracting files...

Path to dataset files: C:\Users\musaq\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1
In [39]:
IMAGE_FOLDER = r"C:\Users\musaq\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1\Images"
CAPTIONS_FILE = r"C:\Users\musaq\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1\captions.txt"
In [ ]:
data = pd.read_csv(IMAGE_FOLDER + "/../captions.txt")
data.head()
Out[ ]:
image caption
0 1000268201_693b08cb0e.jpg A child in a pink dress is climbing up a set o...
1 1000268201_693b08cb0e.jpg A girl going into a wooden building .
2 1000268201_693b08cb0e.jpg A little girl climbing into a wooden playhouse .
3 1000268201_693b08cb0e.jpg A little girl climbing the stairs to her playh...
4 1000268201_693b08cb0e.jpg A little girl in a pink dress going into a woo...
In [ ]:
def readImage(path, img_size=224):
    img = load_img(path, color_mode='rgb', target_size=(img_size, img_size))
    img = img_to_array(img)
    img = img/255.
    return img

def display_images(temp_df):
    temp_df = temp_df.reset_index(drop=True)
    plt.figure(figsize=(20, 20))

    for i in range(15):
        plt.subplot(5, 5, i+1)
        plt.subplots_adjust(hspace=0.7, wspace=0.3)

        img_path = f"{IMAGE_FOLDER}/{temp_df.image[i]}"
        image = readImage(img_path)

        plt.imshow(image)
        plt.title("\n".join(wrap(temp_df.caption[i], 20)))
        plt.axis("off")

Visualization¶

  • Images and their corresponding captions
In [ ]:
display_images(data.sample(15))
No description has been provided for this image

Caption Text Preprocessing Steps¶

  • Convert sentences into lowercase
  • Remove special characters and numbers present in the text
  • Remove extra spaces
  • Remove single characters
  • Add a starting and an ending tag to the sentences to indicate the beginning and the ending of a sentence
No description has been provided for this image
In [ ]:
def text_preprocessing(data):
    data['caption'] = data['caption'].apply(lambda x: x.lower())
    data['caption'] = data['caption'].apply(lambda x: x.replace("[^A-Za-z]",""))
    data['caption'] = data['caption'].apply(lambda x: x.replace("\s+"," "))
    data['caption'] = data['caption'].apply(lambda x: " ".join([word for word in x.split() if len(word)>1]))
    data['caption'] = "startseq "+data['caption']+" endseq"
    return data

Preprocessed Text¶

In [25]:
data = text_preprocessing(data)
captions = data['caption'].tolist()
captions[:10]
Out[25]:
['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tri-colored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

Tokenization and Encoded Representation¶

  • The words in a sentence are separated/tokenized and encoded in a one hot representation
  • These encodings are then passed to the embeddings layer to generate word embeddings
No description has been provided for this image
In [ ]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in captions)

images = data['image'].unique().tolist()
nimages = len(images)

split_index = round(0.85*nimages)
train_images = images[:split_index]
val_images = images[split_index:]

train = data[data['image'].isin(train_images)]
test = data[data['image'].isin(val_images)]

train.reset_index(inplace=True,drop=True)
test.reset_index(inplace=True,drop=True)

tokenizer.texts_to_sequences([captions[1]])[0]
Out[ ]:
[1, 18, 315, 63, 195, 116, 2]

Image Feature Extraction¶

  • DenseNet 201 Architecture is used to extract the features from the images
  • Any other pretrained architecture can also be used for extracting features from these images
  • Since the Global Average Pooling layer is selected as the final layer of the DenseNet201 model for our feature extraction, our image embeddings will be a vector of size 1920
No description has been provided for this image
In [41]:
data = pd.read_csv(CAPTIONS_FILE)

img_size = 224
base = DenseNet201(include_top=False, pooling='avg')
fe = Model(inputs=base.input, outputs=base.output)

features = {}

for image in tqdm(data['image'].unique().tolist()):
    try:
        img_path = os.path.join(IMAGE_FOLDER, image)
        img = load_img(img_path, target_size=(img_size, img_size))
        img = img_to_array(img)
        img = img / 255.
        img = np.expand_dims(img, axis=0)

        feature = fe.predict(img, verbose=0)
        features[image] = feature

    except Exception as e:
        print("Error extracting:", image, e)
100%|██████████| 8091/8091 [24:09<00:00,  5.58it/s]  
In [42]:
all_images = data['image'].unique().tolist()
missing = [img for img in all_images if img not in features]

print("Missing features:", len(missing))

for image in missing:
    try:
        img_path = os.path.join(IMAGE_FOLDER, image)
        img = load_img(img_path, target_size=(img_size, img_size))
        img = img_to_array(img)
        img = img / 255.
        img = np.expand_dims(img, axis=0)

        feature = fe.predict(img, verbose=0)
        features[image] = feature
        print("Recovered:", image)

    except Exception as e:
        print("FAILED:", image, e)

print("Final feature count:", len(features))
Missing features: 0
Final feature count: 8091

Data Generation¶

  • Since Image Caption model training like any other neural network training is a highly resource utillizing process we cannot load the data into the main memory all at once, and hence we need to generate the data in the required format batch wise
  • The inputs will be the image embeddings and their corresonding caption text embeddings for the training process
  • The text embeddings are passed word by word for the caption generation during inference time
In [43]:
class CustomDataGenerator(Sequence):
    
    def __init__(self, df, X_col, y_col, batch_size, directory, tokenizer, 
                 vocab_size, max_length, features,shuffle=True):
    
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.directory = directory
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.features = features
        self.shuffle = shuffle
        self.n = len(self.df)
        
    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
    
    def __len__(self):
        return self.n // self.batch_size
    
    def __getitem__(self,index):
    
        batch = self.df.iloc[index * self.batch_size:(index + 1) * self.batch_size,:]
        X1, X2, y = self.__get_data(batch)        
        return (X1, X2), y
    
    def __get_data(self,batch):
        
        X1, X2, y = list(), list(), list()
        
        images = batch[self.X_col].tolist()
           
        for image in images:
            feature = self.features[image][0]
            
            captions = batch.loc[batch[self.X_col]==image, self.y_col].tolist()
            for caption in captions:
                seq = self.tokenizer.texts_to_sequences([caption])[0]

                for i in range(1,len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=self.max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                    X1.append(feature)
                    X2.append(in_seq)
                    y.append(out_seq)
            
        X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                
        return X1, X2, y


train_generator = CustomDataGenerator(df=train,X_col='image',y_col='caption',batch_size=64,directory=image_path,
                                      tokenizer=tokenizer,vocab_size=vocab_size,max_length=max_length,features=features)

validation_generator = CustomDataGenerator(df=test,X_col='image',y_col='caption',batch_size=64,directory=image_path,
                                      tokenizer=tokenizer,vocab_size=vocab_size,max_length=max_length,features=features)

Modelling¶

  • The image embedding representations are concatenated with the first word of sentence ie. starseq and passed to the LSTM network
  • The LSTM network starts generating words after each input thus forming a sentence at the end
In [44]:
from tensorflow.keras.utils import plot_model

input1 = Input(shape=(1920,))
input2 = Input(shape=(max_length,))

img_features = Dense(256, activation='relu')(input1)
img_features_reshaped = Reshape((1, 256), input_shape=(256,))(img_features)

sentence_features = Embedding(vocab_size, 256, mask_zero=False)(input2)
merged = concatenate([img_features_reshaped,sentence_features],axis=1)
sentence_features = LSTM(256)(merged)
x = Dropout(0.5)(sentence_features)
x = add([x, img_features])
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
output = Dense(vocab_size, activation='softmax')(x)

caption_model = Model(inputs=[input1,input2], outputs=output)
caption_model.compile(loss='categorical_crossentropy',optimizer='adam')


from tensorflow.keras.callbacks import ModelCheckpoint

# Define the model checkpoint
model_name = "model.keras"  # Update the extension to .keras
checkpoint = ModelCheckpoint(
    model_name,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)

earlystopping = EarlyStopping(monitor='val_loss',min_delta = 0, patience = 5, verbose = 1, restore_best_weights=True)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', 
                                            patience=3, 
                                            verbose=1, 
                                            factor=0.2, 
                                            min_lr=0.00000001)

history = caption_model.fit(
        train_generator,
        epochs=50,
        validation_data=validation_generator,
        callbacks=[checkpoint,earlystopping,learning_rate_reduction])
Epoch 1/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 388ms/step - loss: 5.7148
Epoch 1: val_loss improved from None to 4.28454, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 228s 421ms/step - loss: 5.0959 - val_loss: 4.2845 - learning_rate: 0.0010
Epoch 2/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 399ms/step - loss: 4.3362
Epoch 2: val_loss improved from 4.28454 to 3.99680, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 269s 434ms/step - loss: 4.2786 - val_loss: 3.9968 - learning_rate: 0.0010
Epoch 3/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 417ms/step - loss: 4.0488
Epoch 3: val_loss improved from 3.99680 to 3.84010, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 241s 448ms/step - loss: 4.0061 - val_loss: 3.8401 - learning_rate: 0.0010
Epoch 4/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 422ms/step - loss: 3.8458
Epoch 4: val_loss improved from 3.84010 to 3.73999, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 245s 456ms/step - loss: 3.8373 - val_loss: 3.7400 - learning_rate: 0.0010
Epoch 5/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 426ms/step - loss: 3.7164
Epoch 5: val_loss improved from 3.73999 to 3.68348, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 247s 460ms/step - loss: 3.7104 - val_loss: 3.6835 - learning_rate: 0.0010
Epoch 6/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 425ms/step - loss: 3.6117
Epoch 6: val_loss improved from 3.68348 to 3.66124, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 246s 457ms/step - loss: 3.6104 - val_loss: 3.6612 - learning_rate: 0.0010
Epoch 7/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 420ms/step - loss: 3.5224
Epoch 7: val_loss improved from 3.66124 to 3.64358, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 247s 461ms/step - loss: 3.5291 - val_loss: 3.6436 - learning_rate: 0.0010
Epoch 8/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 419ms/step - loss: 3.4494
Epoch 8: val_loss improved from 3.64358 to 3.62544, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 243s 452ms/step - loss: 3.4613 - val_loss: 3.6254 - learning_rate: 0.0010
Epoch 9/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 424ms/step - loss: 3.3864
Epoch 9: val_loss improved from 3.62544 to 3.61507, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 246s 458ms/step - loss: 3.3953 - val_loss: 3.6151 - learning_rate: 0.0010
Epoch 10/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 419ms/step - loss: 3.3339
Epoch 10: val_loss improved from 3.61507 to 3.61181, saving model to model.keras
537/537 ━━━━━━━━━━━━━━━━━━━━ 242s 451ms/step - loss: 3.3483 - val_loss: 3.6118 - learning_rate: 0.0010
Epoch 11/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 433ms/step - loss: 3.2865
Epoch 11: val_loss did not improve from 3.61181
537/537 ━━━━━━━━━━━━━━━━━━━━ 249s 465ms/step - loss: 3.3003 - val_loss: 3.6374 - learning_rate: 0.0010
Epoch 12/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 410ms/step - loss: 3.2425
Epoch 12: val_loss did not improve from 3.61181
537/537 ━━━━━━━━━━━━━━━━━━━━ 237s 441ms/step - loss: 3.2567 - val_loss: 3.6462 - learning_rate: 0.0010
Epoch 13/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 409ms/step - loss: 3.2063
Epoch 13: val_loss did not improve from 3.61181

Epoch 13: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
537/537 ━━━━━━━━━━━━━━━━━━━━ 237s 441ms/step - loss: 3.2202 - val_loss: 3.6673 - learning_rate: 0.0010
Epoch 14/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 423ms/step - loss: 3.1140
Epoch 14: val_loss did not improve from 3.61181
537/537 ━━━━━━━━━━━━━━━━━━━━ 245s 457ms/step - loss: 3.1147 - val_loss: 3.6657 - learning_rate: 2.0000e-04
Epoch 15/50
537/537 ━━━━━━━━━━━━━━━━━━━━ 0s 429ms/step - loss: 3.0936
Epoch 15: val_loss did not improve from 3.61181
537/537 ━━━━━━━━━━━━━━━━━━━━ 248s 463ms/step - loss: 3.0907 - val_loss: 3.6787 - learning_rate: 2.0000e-04
Epoch 15: early stopping
Restoring model weights from the end of the best epoch: 10.
In [45]:
print(len(os.listdir(IMAGE_FOLDER)))
print(data['image'].nunique())
8091
8091

Learning Curve¶

  • The model has clearly overfit, possibly due to less amount of data
  • We can tackle this problem in two ways
    1. Train the model on a larger dataset Flickr40k
    2. Attention Models
In [46]:
plt.figure(figsize=(20,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()
No description has been provided for this image

Caption Generation Utility Functions¶

  • Utility functions to generate the captions of input images at the inference time.
  • Here the image embeddings are passed along with the first word, followed by which the text embedding of each new word is passed to generate the next word
In [54]:
import pickle

# Save the tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Save the feature extractor model
fe.save("feature_extractor.keras")
In [55]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import pickle

# load save files
model_path = "model.keras"
tokenizer_path = "tokenizer.pkl"
feature_extractor_path = "feature_extractor.keras"


def generate_and_display_caption(image_path, model_path, tokenizer_path, feature_extractor_path, max_length=34, img_size=224):
    # Load the trained models and tokenizer
    caption_model = load_model(model_path)
    feature_extractor = load_model(feature_extractor_path)

    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    # Preprocess the image
    img = load_img(image_path, target_size=(img_size, img_size))
    img = img_to_array(img) / 255.0  # Normalize pixel values
    img = np.expand_dims(img, axis=0)
    image_features = feature_extractor.predict(img, verbose=0)  # Extract image features
    
    # Generate the caption
    in_text = "startseq"
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([image_features, sequence], verbose=0)
        yhat_index = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat_index, None)
        if word is None:
            break
        in_text += " " + word
        if word == "endseq":
            break
    caption = in_text.replace("startseq", "").replace("endseq", "").strip()

    # Display the image with the generated caption
    img = load_img(image_path, target_size=(img_size, img_size))
    plt.figure(figsize=(8, 8))
    plt.imshow(img)
    plt.axis('off')
    plt.title(caption, fontsize=16, color='blue')
    plt.show()
In [56]:
import os
print(os.listdir())
['.gitignore', 'feature_extractor.keras', 'flickr8k-image-captioning-using-cnns-lstms (1).ipynb', 'flickr8k-image-captioning-using-cnns-lstms.ipynb', 'img.png', 'img_1.png', 'img_2.png', 'img_3.png', 'main.py', 'model.keras', 'README.md', 'tokenizer.pkl', 'uploaded_image.jpg']
In [58]:
# Example usage
image_path = "img_1.png"
generate_and_display_caption(image_path, "model.keras", "tokenizer.pkl", "feature_extractor.keras")
No description has been provided for this image
In [59]:
image_path = "img_2.png"
generate_and_display_caption(image_path, "model.keras", "tokenizer.pkl", "feature_extractor.keras")
No description has been provided for this image
In [62]:
image_path = "img.png"
generate_and_display_caption(image_path, "model.keras", "tokenizer.pkl", "feature_extractor.keras")
No description has been provided for this image
In [ ]: